suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(parallel)
library(pander)
library(plotly)
library(RColorBrewer)
library(tidyverse)
library(tximport)
library(vsn)
})
source(here("UPSCb-common/src/R/featureSelection.R"))
pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_csv(here("doc/variables.csv"),
col_types=cols(
col_character(),
col_factor(),
col_factor(),
col_factor(),
col_factor()
))
tx2gene translation table
tx2gene <- suppressMessages(read_delim(here("reference/annotation/tx2gene.txt"),delim="\t",
col_names=c("TXID","GENE")))
filelist <- list.files(here("data/salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
Sanity check to ensure that the data is sorted according to the sample info
names(filelist) <- sub("_S\\d+.*","",basename(dirname(filelist)))
stopifnot(all(samples$ID==names(filelist)))
Read the expression at the gene level
This gives us directly gene counts
counts <- suppressMessages(round(tximport(files = filelist,
type = "salmon",
tx2gene=tx2gene)$counts))
# counts <- summarizeToGene(tx,tx2gene=tx2gene)
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "17.8% percent (5766) of 32310 genes are not expressed"
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples)
ggplot(dat,aes(x,y,fill=TISSUE)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90,size=4),axis.title.x=element_blank())
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 5766 rows containing non-finite values (stat_density).
Also removing P14065_119
ggplot(data.frame(value=log10(rowMeans(counts[,colnames(counts)!="P14065_119"]))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 5769 rows containing non-finite values (stat_density).
The same is done for the individual samples colored by CHANGEME.
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(TISSUE=samples$TISSUE[match(ind,samples$ID)]) %>%
mutate(TIME=samples$TIME[match(ind,samples$ID)]) %>%
mutate(TREATMENT=samples$TREATMENT[match(ind,samples$ID)])
ggplot(dat,aes(x=values,group=ind,col=TISSUE)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 399816 rows containing non-finite values (stat_density).
Removing P14065_119
dat <- as.data.frame(log10(counts[,colnames(counts)!="P14065_119"])) %>% utils::stack() %>%
mutate(TISSUE=samples$TISSUE[match(ind,samples$ID)]) %>%
mutate(TIME=samples$TIME[match(ind,samples$ID)]) %>%
mutate(TREATMENT=samples$TREATMENT[match(ind,samples$ID)])
ggplot(dat,aes(x=values,group=ind,col=TISSUE)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 373852 rows containing non-finite values (stat_density).
dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write_csv(as.data.frame(counts) %>% rownames_to_column("ID"),
here("data/analysis/salmon/raw-unormalised-gene-expression_data.csv"))
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = samples,
design = ~ TISSUE * CONDITION)
## converting counts to integer mode
dds <- dds[,-match("P14065_119",colnames(dds))]
save(dds,file=here("data/analysis/salmon/dds.rda"))
Check the size factors (i.e. the sequencing library size effect)
There is some variation -/+ 50%, but that’s acceptable
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
| P14065_101 | P14065_102 | P14065_103 | P14065_104 | P14065_105 | P14065_106 |
|---|---|---|---|---|---|
| 0.8441 | 0.5639 | 0.7496 | 0.7239 | 1.015 | 0.8578 |
| P14065_107 | P14065_108 | P14065_109 | P14065_110 | P14065_111 | P14065_112 |
|---|---|---|---|---|---|
| 1.061 | 0.5505 | 0.7329 | 0.6929 | 0.4535 | 0.9757 |
| P14065_113 | P14065_114 | P14065_115 | P14065_116 | P14065_117 | P14065_118 |
|---|---|---|---|---|---|
| 0.9765 | 0.9171 | 0.6607 | 1.735 | 1.498 | 0.9708 |
| P14065_120 | P14065_121 | P14065_122 | P14065_123 | P14065_124 | P14065_125 |
|---|---|---|---|---|---|
| 1.14 | 1.818 | 0.9036 | 0.978 | 1.913 | 1.655 |
| P14065_126 | P14065_127 | P14065_128 | P14065_129 | P14065_130 | P17901_119 |
|---|---|---|---|---|---|
| 1.053 | 1.06 | 1.105 | 1.285 | 1.122 | 0.9718 |
| P17901_120 | P17901_121 | P17901_122 | P17901_123 | P17901_124 |
|---|---|---|---|---|
| 0.9308 | 0.8531 | 1.435 | 1.723 | 1.436 |
boxplot(sizes, main="Sequencing libraries size factor")
abline(h=1,lty=2,col="grey")
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
The variance stabilisation worked adequately
meanSdPlot(vst[rowSums(vst)>0,])
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
We define the number of variable of the model
nvar=2
An the number of possible combinations
nlevel=nlevels(dds$TISSUE) * nlevels(dds$CONDITION)
We plot the percentage explained by the different components, the red line represent the number of variable in the model, the orange line the number of variable combinations.
ggplot(tibble(x=1:length(percent),y=cumsum(percent)),aes(x=x,y=y)) +
geom_line() + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component") +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",size=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",size=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",size=0.5)
The most variance comes from the tissues. In the leaf, the time has more effect than the treatment, while this look the opposite in the apex.
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
as.data.frame(colData(dds)))
p <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=CONDITION,shape=TISSUE,text=ID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
Filter for noise
conds <- factor(paste(dds$TISSUE,dds$CONDITION))
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3)
vst.cutoff <- 2
hm <- heatmap.2(t(scale(t(vst[sels[[vst.cutoff+1]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
plot(as.hclust(hm$colDendrogram),xlab="",sub="",labels=paste(dds$ID,conds),cex=0.6)
The main differences in the leaf samples are according to the days of sampling. The new T1 dexa samples cluster in T1 with a cluster of T1 havin both mock and Dexa. It would seem generally that the effect of Dexa vs. Mock in leaf at T1 is minimal.
For appices, the situation is somewhat different, the time and treatment seem to be both of importance. T0 samples merged with T1 dexa while T1 mock form its own cluster, possibly indicating a delay induced by the treatment. The newer samples form their own clusters, possibly indicating that a batch effect (most likely due to sampling rather than sequencing as the leaf sample do not show such an effect) has a stronger influence than the biological signal. At T3 there is a clear separation between mock and dexa.
With regards to the batch observed in the new appices samples, it is however probably marginal as it is not visible in the first 2 dimension of the PCA, that contribute 80% of the variance. So it will be within the whole dataset at most 3%. If anything, assuming the effect is due to the sampling, hence biological noise, it would only make the comparison more robust.
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] vsn_3.58.0 tximport_1.18.0
## [3] forcats_0.5.0 stringr_1.4.0
## [5] dplyr_1.0.3 purrr_0.3.4
## [7] readr_1.4.0 tidyr_1.1.2
## [9] tibble_3.0.5 tidyverse_1.3.0
## [11] RColorBrewer_1.1-2 plotly_4.9.3
## [13] pander_0.6.3 hyperSpec_0.99-20201127
## [15] xml2_1.3.2 ggplot2_3.3.3
## [17] lattice_0.20-41 here_1.0.1
## [19] gplots_3.1.1 DESeq2_1.30.0
## [21] SummarizedExperiment_1.20.0 Biobase_2.50.0
## [23] MatrixGenerics_1.2.0 matrixStats_0.57.0
## [25] GenomicRanges_1.42.0 GenomeInfoDb_1.26.2
## [27] IRanges_2.24.1 S4Vectors_0.28.1
## [29] BiocGenerics_0.36.0 data.table_1.13.6
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-0 ellipsis_0.3.1 rprojroot_2.0.2
## [4] XVector_0.30.0 fs_1.5.0 rstudioapi_0.13
## [7] hexbin_1.28.2 farver_2.0.3 affyio_1.60.0
## [10] bit64_4.0.5 AnnotationDbi_1.52.0 fansi_0.4.2
## [13] lubridate_1.7.9.2 splines_4.0.3 geneplotter_1.68.0
## [16] knitr_1.30 jsonlite_1.7.2 broom_0.7.3
## [19] annotate_1.68.0 dbplyr_2.0.0 png_0.1-7
## [22] BiocManager_1.30.10 compiler_4.0.3 httr_1.4.2
## [25] backports_1.2.1 assertthat_0.2.1 Matrix_1.3-2
## [28] lazyeval_0.2.2 limma_3.46.0 cli_2.2.0
## [31] htmltools_0.5.1 tools_4.0.3 affy_1.68.0
## [34] gtable_0.3.0 glue_1.4.2 GenomeInfoDbData_1.2.4
## [37] Rcpp_1.0.6 cellranger_1.1.0 vctrs_0.3.6
## [40] preprocessCore_1.52.1 crosstalk_1.1.1 xfun_0.20
## [43] testthat_3.0.1 rvest_0.3.6 lifecycle_0.2.0
## [46] gtools_3.8.2 XML_3.99-0.5 zlibbioc_1.36.0
## [49] scales_1.1.1 hms_1.0.0 yaml_2.2.1
## [52] memoise_1.1.0 latticeExtra_0.6-29 stringi_1.5.3
## [55] RSQLite_2.2.2 highr_0.8 genefilter_1.72.0
## [58] caTools_1.18.1 BiocParallel_1.24.1 rlang_0.4.10
## [61] pkgconfig_2.0.3 bitops_1.0-6 evaluate_0.14
## [64] labeling_0.4.2 htmlwidgets_1.5.3 bit_4.0.4
## [67] tidyselect_1.1.0 magrittr_2.0.1 R6_2.5.0
## [70] generics_0.1.0 DelayedArray_0.16.0 DBI_1.1.1
## [73] pillar_1.4.7 haven_2.3.1 withr_2.4.0
## [76] survival_3.2-7 RCurl_1.98-1.2 modelr_0.1.8
## [79] crayon_1.3.4 KernSmooth_2.23-18 rmarkdown_2.6
## [82] jpeg_0.1-8.1 locfit_1.5-9.4 readxl_1.3.1
## [85] blob_1.2.1 reprex_0.3.0 digest_0.6.27
## [88] xtable_1.8-4 munsell_0.5.0 viridisLite_0.3.0